# We have triennial household counts from the SCF, but we need quarterly 
# household counts for the DFAs to run. We handle this by linearly interpolating
# between the points we do have, and then to extrapolate, we use the pattern
# implied by the change in the quarterly CPS household count 

# if this is not being run inside the larger DFA program, get the necessary file paths and functions
if (exists("TEMPDISAGG") == FALSE) {
  
  
  home_directory = "[your home directory]" # change this to your home directory
  scf_directory <- file.path(home_directory, "/inputs") # where the files from MEcS are located
  output_directory <- file.path(home_directory, "/outputs")
  master_scripts_directory <- file.path(home_directory, "/DFAs_construction_code") # where the program sources DFA functions
  
  source(file.path(master_scripts_directory, "functions.R"))
  
  # set group here if you are not running this inside the larger DFA program
  group <- "generation"
  agg_level <- "default"
  
  fofnet_path <- get_fofnet(message = FALSE)
  date_series <- fame2df("PC073164013.Q", fofnet_path)
  lastqtr <- max(date_series$date)
  interpqtrs <- as.yearqtr(seq.Date(as.Date("1989-09-30"),lastqtr, by = "quarter"))
  
  settings <- dfa_splits(group = group, agg_level = agg_level)
  groupnames <- settings[["groupnames"]]
  scf.recode <- settings[["scf.recode"]]
  input.file <- settings[["input.file"]]
  tag <- settings[["tag"]]
  
  filepath <- file.path(scf_directory, paste0(input.file, ".xlsx"))
  
}

# Read in SCF household count triennial data  ---------------------------------------

sheet_names <- excel_sheets(filepath)
sheet_names <- sapply(strsplit(sheet_names," "), `[`, 1)

scf_hhcount <- read_excel(filepath, 
                          sheet = which(sheet_names == "households"), 
                          trim_ws = TRUE, 
                          col_types = c("numeric", "numeric", "numeric", "skip", "skip", "skip", "skip"), 
                          col_names = c("date", "cat", "hhcount")) %>%
  drop_na(cat) %>% # sometimes there are random extra rows, indicated by NA in the cat column
  mutate(cat = recode(cat, !!!scf.recode)) %>% # relabel groups from MEcS files
  mutate(cat = factor(cat, levels = groupnames)) %>% # make the groups a factor instead of character for better organization
  group_by(date,cat) %>%
  dplyr::summarise_all(sum) %>% # add up all the groups within each group we are looking to distribute (e.g. Bottom50 = 1 through 5 in the wealth file)
  arrange(date,cat) %>%
  ungroup() %>% 
  mutate(date = as.yearqtr(paste0(date, "q3")))

lastscf <- last(scf_hhcount$date)


# Interpolate for quarters until last SCF ---------------------------------

hhcount_interp <-   scf_hhcount %>% 
  pivot_wider(id_cols = date, names_from = cat, values_from = hhcount) %>%
  complete(date = interpqtrs) %>% # add NA values for all the quarters you want to interpolate over
  filter(date <= lastscf) %>% # only want to interpolate for quarters before the last scf
  mutate_if(is.numeric, ~spline(x = date, y = ., xout = interpqtrs[interpqtrs <= lastscf])$y)  # apply the cubic spline
# pivot_longer(-date, names_to = "cat", values_to = "hhcount") %>% 

# Read in CPS data --------------------------------------------------------

# for groups that are formed based off percentiles, just calculate percentile household counts using total CPS households 
if (group %in% c("networth", "income", "nipainc", "buildnipainc")) {
  
  agg_vec <- settings[["agg_vec"]]
  
  cps_hhcount <- read_csv(file.path(home_directory, "/inputs/networth_cps_hh.csv")) %>% # maybe in the future ask for just the total household counts saved instead of using networth but it doesn't matter
    dplyr::rename("date" = quarter) %>% 
    mutate_at(vars(-date), ~.*1000) %>% # put into households rather than thousands of households
    mutate(date = as.yearqtr(date)) %>%
    pivot_longer(-date, names_to = "cat", values_to = "hhcount") %>% 
    group_by(date) %>% 
    dplyr::summarise(hhcount = sum(hhcount)) # this gives the total CPS household count (could get this saved in the folder to read in directly)
  
  
  for (i in 1:length(agg_vec)) {
    cps_hhcount[[paste0("cat", i)]] <- cps_hhcount$hhcount * agg_vec[i]
  }
  
  names(cps_hhcount)[3:(2+length(agg_vec))] <- groupnames
  
  cps_hhcount <- cps_hhcount %>% select(-hhcount) %>% filter(date >= lastscf)
  
} else {
  cps_hhcount <- read_csv(file.path(home_directory, "/inputs", paste0(group, "_cps_hh.csv"))) %>% 
    dplyr::rename("date" = quarter) %>% 
    mutate_at(vars(-date), ~.*1000) %>% # put into households rather than thousands of households
    mutate(date = as.yearqtr(date)) %>%
    filter(date >= lastscf)
  
}

# Extrapolate with CPS data -----------------------------------------------

hhcount_extrap <- dplyr::full_join(hhcount_interp %>% dplyr::filter(date == lastscf), cps_hhcount, by = "date", suffix = c(".scf", ".cps")) %>% 
  tidyr::pivot_longer(-date, values_to = "hhcount") %>% 
  tidyr::separate(name, into = c("cat", "dataset"), sep = "[.]") %>% 
  tidyr::pivot_wider(names_from = "dataset", values_from = "hhcount") %>%
  dplyr::group_by(cat) %>% 
  
  # Step 1: Grow 2019q3 numbers with CPS growth rates
  dplyr::mutate(
    cps_growth_rate = case_when(
      date == lastscf ~ 1,
      date > lastscf ~ (cps - dplyr::lag(cps, order_by = date)) / dplyr::lag(cps, order_by = date) + 1
    ),
    hhcount = case_when(
      date == lastscf ~ scf,
      date > lastscf  ~ scf[date == lastscf] * cumprod(cps_growth_rate)
    )
  ) %>% 
  
  # Step 2: Normalize household counts across splits
  # this process makes the sum of all the household counts equal to the sum of the CPS household counts
  
  dplyr::ungroup() %>% 
  dplyr::group_by(date) %>% 
  dplyr::mutate(scf_total = sum(hhcount),
                cps_total = sum(cps),
                hhcount = hhcount * cps_total / scf_total 
  ) %>% 
  
  # Step 3: Adjust for 2019q3 difference in total household count 
  
  dplyr::ungroup() %>% 
  dplyr::mutate(hhcount = hhcount * scf_total[date == lastscf] / cps_total[date == lastscf]) %>% 
  
  # Reformat 
  dplyr::select(date, cat, hhcount) %>% 
  tidyr::pivot_wider(names_from = "cat", values_from = "hhcount") %>% 
  dplyr::filter(date > lastscf)

# Join extrapolated and interpolated household counts ---------------------

hhcount <- bind_rows(hhcount_interp, hhcount_extrap)

# Save as a CSV -----------------------------------------------------------

write_csv(hhcount, file.path(output_directory, paste0("hhcount_", group, "_", agg_level, filetag, ".csv")))


